Load Dataset

suic_dataset <- read.csv('master.csv', strip.white = TRUE)
suic_dataset <- suic_dataset[, -8]
str(suic_dataset)
## 'data.frame':    27820 obs. of  11 variables:
##  $ country           : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year              : int  1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
##  $ sex               : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
##  $ age               : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
##  $ suicides_no       : int  21 16 14 1 9 1 6 4 1 0 ...
##  $ population        : int  312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
##  $ suicides.100k.pop : num  6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
##  $ HDI.for.year      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gdp_for_year....  : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
##  $ gdp_per_capita....: int  796 796 796 796 796 796 796 796 796 796 ...
##  $ generation        : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...
head(suic_dataset, 10)
##    country year    sex         age suicides_no population
## 1  Albania 1987   male 15-24 years          21     312900
## 2  Albania 1987   male 35-54 years          16     308000
## 3  Albania 1987 female 15-24 years          14     289700
## 4  Albania 1987   male   75+ years           1      21800
## 5  Albania 1987   male 25-34 years           9     274300
## 6  Albania 1987 female   75+ years           1      35600
## 7  Albania 1987 female 35-54 years           6     278800
## 8  Albania 1987 female 25-34 years           4     257200
## 9  Albania 1987   male 55-74 years           1     137500
## 10 Albania 1987 female  5-14 years           0     311000
##    suicides.100k.pop HDI.for.year gdp_for_year.... gdp_per_capita....
## 1               6.71           NA    2,156,624,900                796
## 2               5.19           NA    2,156,624,900                796
## 3               4.83           NA    2,156,624,900                796
## 4               4.59           NA    2,156,624,900                796
## 5               3.28           NA    2,156,624,900                796
## 6               2.81           NA    2,156,624,900                796
## 7               2.15           NA    2,156,624,900                796
## 8               1.56           NA    2,156,624,900                796
## 9               0.73           NA    2,156,624,900                796
## 10              0.00           NA    2,156,624,900                796
##         generation
## 1     Generation X
## 2           Silent
## 3     Generation X
## 4  G.I. Generation
## 5          Boomers
## 6  G.I. Generation
## 7           Silent
## 8          Boomers
## 9  G.I. Generation
## 10    Generation X

Process Data

sapply(suic_dataset, function(x) all(is.na(x) || is.infinite(x)))
##            country               year                sex 
##              FALSE              FALSE              FALSE 
##                age        suicides_no         population 
##              FALSE              FALSE              FALSE 
##  suicides.100k.pop       HDI.for.year   gdp_for_year.... 
##              FALSE               TRUE              FALSE 
## gdp_per_capita....         generation 
##              FALSE              FALSE
temp <- suic_dataset
new_colnames <- c('country', 'year', 'sex', 'age', 'suicidect', 'pop', 'suiciderate', 'hdiyear', 'gdpyear', 'gdpcapita', 'gen')
colnames(temp) = new_colnames
custom_factor <- function(cols){
  for(i in cols){
    level_c <- levels(temp[[i]])
    label_c <- c()
    for(j in 0:length(level_c)) label_c[j] <- j
    temp[[i]] <<- factor(temp[[i]], labels = label_c, levels = level_c)
  }
}

cols_ <- colnames(temp[, sapply(temp, is.factor)])
cols_ <- cols_[!cols_ %in% c('country', 'gdpcapita', 'age')]
custom_factor(cols_)

agelevels <- levels(temp$age)
agelevels
## [1] "15-24 years" "25-34 years" "35-54 years" "5-14 years"  "55-74 years"
## [6] "75+ years"
agelabels <- c('gen_X', 'boomers', 'silent', 'gen_X', 'gen_GI', 'gen_GI')
temp$age <- factor(temp$age, levels = levels(temp$age), labels = c('gen_X', 'boomers', 'silent', 'gen_X', 'gen_GI', 'gen_GI'))

temp$gdpyear <- as.numeric(gsub(",","",levels(suic_dataset$gdp_for_year....), fixed = TRUE))[suic_dataset$gdp_for_year....]
temp$suiciderate <- (temp$suicidect / (temp$pop / 100000))

temp <- temp[, -11]
temp <- temp[!colnames(temp) %in% c('hdiyear')]
summary(temp)
##         country           year      sex            age      
##  Austria    :  382   Min.   :1985   1:13910   gen_X  :9252  
##  Iceland    :  382   1st Qu.:1995   2:13910   boomers:4642  
##  Mauritius  :  382   Median :2002             silent :4642  
##  Netherlands:  382   Mean   :2001             gen_GI :9284  
##  Argentina  :  372   3rd Qu.:2008                           
##  Belgium    :  372   Max.   :2016                           
##  (Other)    :25548                                          
##    suicidect            pop            suiciderate      
##  Min.   :    0.0   Min.   :     278   Min.   :  0.0000  
##  1st Qu.:    3.0   1st Qu.:   97498   1st Qu.:  0.9187  
##  Median :   25.0   Median :  430150   Median :  5.9909  
##  Mean   :  242.6   Mean   : 1844794   Mean   : 12.8161  
##  3rd Qu.:  131.0   3rd Qu.: 1486143   3rd Qu.: 16.6177  
##  Max.   :22338.0   Max.   :43805214   Max.   :224.9719  
##                                                         
##     gdpyear            gdpcapita     
##  Min.   :4.692e+07   Min.   :   251  
##  1st Qu.:8.985e+09   1st Qu.:  3447  
##  Median :4.811e+10   Median :  9372  
##  Mean   :4.456e+11   Mean   : 16866  
##  3rd Qu.:2.602e+11   3rd Qu.: 24874  
##  Max.   :1.812e+13   Max.   :126352  
## 

Preparing Data For Model

library(reshape)
totals <- function(data){
  tl_df <- data.frame(NA, NA, NA, NA)
  names(tl_df) <- c("country","year","total_pop", "total_suicide_ct")
  tl_df <- na.omit(tl_df)  
  for(i in unique(data$country)){
    for(j in unique(data$year)){
      tdf <- data.frame(i, j, sum(data[data$country == i & data$year == j, 'pop']), 
                        sum(data[data$country == i & data$year == j, 'suicidect']))
      names(tdf) <- c("country", "year","total_pop", "total_suicide_ct")
      tl_df <- rbind(tl_df, tdf)    
    }
  }
  country <- unique(tl_df$country)
  tl_df <- reshape(tl_df, direction = "wide", idvar = c("country"), timevar = "year")
  rownames(tl_df) <- unique(data$country)
  return(tl_df[-1])
}

df <- totals(temp)
head(df,1)
##         total_pop.1987 total_suicide_ct.1987 total_pop.1988
## Albania        2709600                    73        2764300
##         total_suicide_ct.1988 total_pop.1989 total_suicide_ct.1989
## Albania                    63        2803100                    68
##         total_pop.1992 total_suicide_ct.1992 total_pop.1993
## Albania        2822500                    47        2807300
##         total_suicide_ct.1993 total_pop.1994 total_suicide_ct.1994
## Albania                    73        2849300                    50
##         total_pop.1995 total_suicide_ct.1995 total_pop.1996
## Albania        2903400                    88        2940200
##         total_suicide_ct.1996 total_pop.1997 total_suicide_ct.1997
## Albania                    89        2977300                   170
##         total_pop.1998 total_suicide_ct.1998 total_pop.1999
## Albania        3012700                   154        3029700
##         total_suicide_ct.1999 total_pop.2000 total_suicide_ct.2000
## Albania                   139        2796300                    54
##         total_pop.2001 total_suicide_ct.2001 total_pop.2002
## Albania        2799349                   119        2818839
##         total_suicide_ct.2002 total_pop.2003 total_suicide_ct.2003
## Albania                   133        2843929                   124
##         total_pop.2004 total_suicide_ct.2004 total_pop.2005
## Albania        2874991                   146        2783320
##         total_suicide_ct.2005 total_pop.2006 total_suicide_ct.2006
## Albania                     0        2780176                     0
##         total_pop.2007 total_suicide_ct.2007 total_pop.2008
## Albania        2770344                   124        2757059
##         total_suicide_ct.2008 total_pop.2009 total_suicide_ct.2009
## Albania                   160        2745735                     0
##         total_pop.2010 total_suicide_ct.2010 total_pop.1985
## Albania        2736025                    96              0
##         total_suicide_ct.1985 total_pop.1986 total_suicide_ct.1986
## Albania                     0              0                     0
##         total_pop.1990 total_suicide_ct.1990 total_pop.1991
## Albania              0                     0              0
##         total_suicide_ct.1991 total_pop.2012 total_suicide_ct.2012
## Albania                     0              0                     0
##         total_pop.2013 total_suicide_ct.2013 total_pop.2014
## Albania              0                     0              0
##         total_suicide_ct.2014 total_pop.2015 total_suicide_ct.2015
## Albania                     0              0                     0
##         total_pop.2011 total_suicide_ct.2011 total_pop.2016
## Albania              0                     0              0
##         total_suicide_ct.2016
## Albania                     0
total_df <- data.frame(NA, NA, NA, NA)
names(total_df) <- c("total_pop", "total_suicide_ct")
total_df <- na.omit(total_df) 
for(i in unique(temp$country)){
  tdf <- data.frame(sum(temp[temp$country == i, 'pop']) ,sum(temp[temp$country == i, 'suicidect']))
  colnames(tdf) <- c("total_pop", "total_suicide_ct")
  total_df <- rbind(total_df, tdf)
}

rm(tdf)
rm(i)
rm(j)
## Warning in rm(j): object 'j' not found
rownames(total_df) <- unique(temp$country)
df <- cbind(df, total_df)
scaled_df <- data.frame(scale(df))
head(scaled_df,1)
##         total_pop.1987 total_suicide_ct.1987 total_pop.1988
## Albania     -0.2698369            -0.2838701     -0.2533746
##         total_suicide_ct.1988 total_pop.1989 total_suicide_ct.1989
## Albania            -0.2795608     -0.2826843            -0.2809674
##         total_pop.1992 total_suicide_ct.1992 total_pop.1993
## Albania     -0.3660521            -0.3358863     -0.3537427
##         total_suicide_ct.1993 total_pop.1994 total_suicide_ct.1994
## Albania            -0.3087522     -0.3551332            -0.3062697
##         total_pop.1995 total_suicide_ct.1995 total_pop.1996
## Albania     -0.3629368            -0.3186826     -0.3776569
##         total_suicide_ct.1996 total_pop.1997 total_suicide_ct.1997
## Albania              -0.33355     -0.3819605            -0.3226696
##         total_pop.1998 total_suicide_ct.1998 total_pop.1999
## Albania     -0.3836518            -0.3350752     -0.3936341
##         total_suicide_ct.1999 total_pop.2000 total_suicide_ct.2000
## Albania            -0.3319577     -0.3986281            -0.3462654
##         total_pop.2001 total_suicide_ct.2001 total_pop.2002
## Albania     -0.3854671            -0.3290675     -0.3966008
##         total_suicide_ct.2002 total_pop.2003 total_suicide_ct.2003
## Albania            -0.3381362     -0.3964685            -0.3468724
##         total_pop.2004 total_suicide_ct.2004 total_pop.2005
## Albania     -0.3710656            -0.3315149     -0.3675342
##         total_suicide_ct.2005 total_pop.2006 total_suicide_ct.2006
## Albania             -0.354213       -0.38778            -0.3663077
##         total_pop.2007 total_suicide_ct.2007 total_pop.2008
## Albania     -0.3908419            -0.3454446      -0.388568
##         total_suicide_ct.2008 total_pop.2009 total_suicide_ct.2009
## Albania            -0.3492111     -0.4122316            -0.3830215
##         total_pop.2010 total_suicide_ct.2010 total_pop.1985
## Albania     -0.4139084             -0.371006      -0.340438
##         total_suicide_ct.1985 total_pop.1986 total_suicide_ct.1986
## Albania            -0.2876006      -0.344379            -0.2832239
##         total_pop.1990 total_suicide_ct.1990 total_pop.1991
## Albania      -0.428922            -0.3419059     -0.4308832
##         total_suicide_ct.1991 total_pop.2012 total_suicide_ct.2012
## Albania            -0.3493309     -0.4571275            -0.3847174
##         total_pop.2013 total_suicide_ct.2013 total_pop.2014
## Albania     -0.4491358            -0.3760499     -0.4507231
##         total_suicide_ct.2014 total_pop.2015 total_suicide_ct.2015
## Albania            -0.3780522     -0.4138309            -0.3459816
##         total_pop.2011 total_suicide_ct.2011 total_pop.2016
## Albania      -0.475017            -0.3864962     -0.2220068
##         total_suicide_ct.2016  total_pop total_suicide_ct
## Albania            -0.2783145 -0.3970021       -0.3559822

Making Clusters

library(factoextra)
distance <- get_dist(scaled_df[, 65:66])
fviz_dist(distance, show_labels = TRUE, lab_size = NULL,
          gradient = list(low = "blue", below_average = "yellow", average = "green", high = "red", extremely_high = "black"))

Creating Model

Taking 6 clusters as starting point
Animation of steps to classify data in clusters

library(animation)
library(heatmap3)

kmeans.ani(scaled_df[, 65:66], centers = 4)

km <- kmeans(scaled_df[, 65:66], centers = 4, nstart = 30)
str(km)
## List of 9
##  $ cluster     : Named int [1:101] 3 3 4 3 3 4 3 3 3 3 ...
##   ..- attr(*, "names")= chr [1:101] "Albania" "Antigua and Barbuda" "Argentina" "Armenia" ...
##  $ centers     : num [1:4, 1:2] 4.126 1.487 -0.358 0.364 5.216 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:2] "total_pop" "total_suicide_ct"
##  $ totss       : num 200
##  $ withinss    : num [1:4] 12.55 9.34 1.7 1.4
##  $ tot.withinss: num 25
##  $ betweenss   : num 175
##  $ size        : int [1:4] 3 8 79 11
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
km
## K-means clustering with 4 clusters of sizes 3, 8, 79, 11
## 
## Cluster means:
##    total_pop total_suicide_ct
## 1  4.1264518       5.21555114
## 2  1.4865704       0.87433520
## 3 -0.3578772      -0.29569613
## 4  0.3636707       0.06533262
## 
## Clustering vector:
##                      Albania          Antigua and Barbuda 
##                            3                            3 
##                    Argentina                      Armenia 
##                            4                            3 
##                        Aruba                    Australia 
##                            3                            4 
##                      Austria                   Azerbaijan 
##                            3                            3 
##                      Bahamas                      Bahrain 
##                            3                            3 
##                     Barbados                      Belarus 
##                            3                            3 
##                      Belgium                       Belize 
##                            3                            3 
##       Bosnia and Herzegovina                       Brazil 
##                            3                            2 
##                     Bulgaria                   Cabo Verde 
##                            3                            3 
##                       Canada                        Chile 
##                            4                            3 
##                     Colombia                   Costa Rica 
##                            4                            3 
##                      Croatia                         Cuba 
##                            3                            3 
##                       Cyprus               Czech Republic 
##                            3                            3 
##                      Denmark                     Dominica 
##                            3                            3 
##                      Ecuador                  El Salvador 
##                            3                            3 
##                      Estonia                         Fiji 
##                            3                            3 
##                      Finland                       France 
##                            3                            2 
##                      Georgia                      Germany 
##                            3                            2 
##                       Greece                      Grenada 
##                            3                            3 
##                    Guatemala                       Guyana 
##                            3                            3 
##                      Hungary                      Iceland 
##                            3                            3 
##                      Ireland                       Israel 
##                            3                            3 
##                        Italy                      Jamaica 
##                            2                            3 
##                        Japan                   Kazakhstan 
##                            1                            4 
##                     Kiribati                       Kuwait 
##                            3                            3 
##                   Kyrgyzstan                       Latvia 
##                            3                            3 
##                    Lithuania                   Luxembourg 
##                            3                            3 
##                        Macau                     Maldives 
##                            3                            3 
##                        Malta                    Mauritius 
##                            3                            3 
##                       Mexico                     Mongolia 
##                            2                            3 
##                   Montenegro                  Netherlands 
##                            3                            3 
##                  New Zealand                    Nicaragua 
##                            3                            3 
##                       Norway                         Oman 
##                            3                            3 
##                       Panama                     Paraguay 
##                            3                            3 
##                  Philippines                       Poland 
##                            4                            4 
##                     Portugal                  Puerto Rico 
##                            3                            3 
##                        Qatar            Republic of Korea 
##                            3                            2 
##                      Romania           Russian Federation 
##                            4                            1 
##        Saint Kitts and Nevis                  Saint Lucia 
##                            3                            3 
## Saint Vincent and Grenadines                   San Marino 
##                            3                            3 
##                       Serbia                   Seychelles 
##                            3                            3 
##                    Singapore                     Slovakia 
##                            3                            3 
##                     Slovenia                 South Africa 
##                            3                            4 
##                        Spain                    Sri Lanka 
##                            4                            3 
##                     Suriname                       Sweden 
##                            3                            3 
##                  Switzerland                     Thailand 
##                            3                            4 
##          Trinidad and Tobago                       Turkey 
##                            3                            3 
##                 Turkmenistan                      Ukraine 
##                            3                            2 
##         United Arab Emirates               United Kingdom 
##                            3                            2 
##                United States                      Uruguay 
##                            1                            3 
##                   Uzbekistan 
##                            3 
## 
## Within cluster sum of squares by cluster:
## [1] 12.545821  9.338014  1.701099  1.404262
##  (between_SS / total_SS =  87.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
fviz_cluster(km, data = scaled_df[, 65:66])

heatmap3(matrix(scaled_df$total_suicide_ct, km$cluster))

Finding optimal number of clusters

#Elbow Method
set.seed(300)
fviz_nbclust(scaled_df[, 65:66], kmeans, method="wss") + geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

#Silhouette Method
set.seed(300)
fviz_nbclust(scaled_df[, 65:66], kmeans, method="silhouette") + labs(subtitle = "Silhouette method")

#Gap Method
library(cluster)
gap_stat <- clusGap(scaled_df[, 65:66], FUN = kmeans, nstart = 30, K.max = 6, B = 50)
set.seed(300)
fviz_gap_stat(gap_stat) + labs(subtitle = "Gap-Statisitc method")

library(NbClust)
NbClust(scaled_df[, 65:66], distance = "euclidean", method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 8 proposed 14 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##         KL       CH Hartigan     CCC    Scott   Marriot   TrCovW  TraceW
## 2   2.6117 230.0293  99.8983  0.1649 144.6672 2842.5042 263.2695 60.1770
## 3   3.0197 278.1834  19.4650  1.4248 300.3264 1369.4856 222.9970 29.9526
## 4   1.2651 226.4452  34.7739  0.2665 338.1373 1674.3658 202.0378 24.9892
## 5   0.8136 236.9435   8.0957  0.9380 376.0461 1797.4806  93.8094 18.3948
## 6   0.6317 205.0013   1.1610 -0.4705 394.5005 2156.1259  87.8465 16.9642
## 7   1.0148 171.2935   1.0316 -2.2689 398.3723 2824.3539  88.3630 16.7594
## 8   0.9360 147.0010   0.1421 -3.8157 400.4381 3614.2657  87.8423 16.5774
## 9   2.4763 127.4548  14.9629 -5.2720 400.8860 4554.0638  87.7786 16.5521
## 10  0.4422 131.9320  -0.4015 -5.0019 454.8571 3294.8926  64.0149 14.2367
## 11  1.3235 116.8761   1.6115 -6.2722 453.7221 4031.8754  61.7071 14.2998
## 12  0.8162 107.0967   0.1103 -7.2171 460.3015 4495.6576  62.9542 14.0482
## 13  0.1085  97.1983 226.1515 -8.2603 460.8786 5246.0919  62.9249 14.0308
## 14 33.3059 333.8559   0.1565  3.9755 689.0486  635.4579   4.6682  3.9303
## 15  0.9876 307.0081   0.0698  3.0120 689.5247  726.0493   4.6483  3.9233
##    Friedman   Rubin Cindex     DB Silhouette     Duda Pseudot2   Beale
## 2    5.9240  3.3235 0.0972 0.5257     0.8853   0.8758   1.5599  0.0945
## 3   17.6053  6.6772 0.0700 0.6505     0.7851   1.7443  -3.8404 -0.2134
## 4   21.8784  8.0035 0.0437 0.8294     0.6751   3.4670 -54.0788 -0.7024
## 5   23.5831 10.8726 0.0399 0.6515     0.6727   2.1755 -36.2020 -0.5332
## 6   26.3227 11.7895 0.0246 0.7002     0.6127   1.5507  -8.1675 -0.3044
## 7   27.0722 11.9336 0.0216 0.7550     0.5000   0.7522   5.9289  0.3088
## 8   27.3485 12.0646 0.0187 0.8516     0.5035  75.7689 -12.8284 -0.8458
## 9   27.4338 12.0830 0.0197 0.8879     0.4464   1.1682  -0.7199 -0.1309
## 10  41.2026 14.0482 0.0150 0.7763     0.5283   5.6490  -9.0528 -0.4115
## 11  40.8996 13.9862 0.0134 0.8108     0.4940 129.9308  -7.9384 -0.8505
## 12  42.9912 14.2367 0.0124 0.7490     0.5046 185.9366  -4.9731 -0.8289
## 13  43.1931 14.2543 0.0129 0.7208     0.4956   0.2742   2.6476  1.3238
## 14 119.2248 50.8865 0.0187 0.6635     0.5062   0.6686   6.9383  0.4575
## 15 119.5790 50.9781 0.0184 0.6862     0.4939   4.1354 -25.0201 -0.7311
##    Ratkowsky    Ball Ptbiserial     Frey McClain   Dunn Hubert  SDindex
## 2     0.5912 30.0885     0.8842  12.3143  0.0070 0.3567 0.0104 213.9798
## 3     0.5318  9.9842     0.7221   8.7587  0.0265 0.0482 0.0101 123.8242
## 4     0.4673  6.2473     0.5953   8.4034  0.0456 0.0297 0.0101  98.5980
## 5     0.4261  3.6790     0.5714   9.2239  0.0498 0.0152 0.0102  67.8472
## 6     0.3905  2.8274     0.4307  26.3424  0.0850 0.0199 0.0103  61.8064
## 7     0.3617  2.3942     0.3011   3.5458  0.1970 0.0045 0.0104  69.5797
## 8     0.3385  2.0722     0.2821 -17.8328  0.2065 0.0031 0.0104  70.9716
## 9     0.3192  1.8391     0.2416  -4.7044  0.3113 0.0040 0.0104  86.4207
## 10    0.3047  1.4237     0.2944  14.8357  0.1549 0.0045 0.0103  60.7570
## 11    0.2905  1.3000     0.2406   0.5687  0.2277 0.0040 0.0104  83.5982
## 12    0.2783  1.1707     0.2396 -18.8482  0.2146 0.0040 0.0104  84.5265
## 13    0.2674  1.0793     0.2122  -0.1163  0.2942 0.0029 0.0104 119.7723
## 14    0.2646  0.2807     0.2144  19.8254  0.2306 0.0053 0.0104  98.2165
## 15    0.2557  0.2616     0.2041 -15.5083  0.2538 0.0049 0.0104 125.6693
##    Dindex   SDbw
## 2  0.4897 1.8469
## 3  0.3249 1.1073
## 4  0.2603 1.0653
## 5  0.2271 0.6556
## 6  0.1874 0.6059
## 7  0.1705 0.6416
## 8  0.1621 0.6213
## 9  0.1594 0.6833
## 10 0.1358 0.4410
## 11 0.1328 0.5230
## 12 0.1261 0.4546
## 13 0.1231 0.4488
## 14 0.0855 0.2194
## 15 0.0842 0.2339
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2         -0.5522           -30.9205       0.9118
## 3         -0.7431           -21.1118       1.0000
## 4          0.4040           112.0981       1.0000
## 5          0.4005           100.3069       1.0000
## 6         -0.1908          -143.5463       1.0000
## 7          0.0832           198.4289       0.7366
## 8         -0.1908           -81.1349       1.0000
## 9         -0.0307          -167.9011       1.0000
## 10        -0.7431           -25.8033       1.0000
## 11        -0.1908           -49.9292       1.0000
## 12        -0.2510           -24.9172       1.0000
## 13        -0.7431            -2.3458       0.4303
## 14         0.0222           618.0234       0.6383
## 15         0.2234           114.6927       1.0000
## 
## $Best.nc
##                      KL       CH Hartigan     CCC  Scott  Marriot   TrCovW
## Number_clusters 14.0000  14.0000  13.0000 14.0000  14.00   14.000   5.0000
## Value_Index     33.3059 333.8559 226.0412  3.9755 228.17 4701.225 108.2284
##                 TraceW Friedman    Rubin  Cindex     DB Silhouette   Duda
## Number_clusters  3.000  14.0000  14.0000 12.0000 2.0000     2.0000 2.0000
## Value_Index     25.261  76.0318 -36.5407  0.0124 0.5257     0.8853 0.8758
##                 PseudoT2  Beale Ratkowsky    Ball PtBiserial   Frey
## Number_clusters   4.0000 2.0000    2.0000  3.0000     2.0000 7.0000
## Value_Index     -54.0788 0.0945    0.5912 20.1043     0.8842 3.5458
##                 McClain   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters   2.000 2.0000      0  10.000      0 14.0000
## Value_Index       0.007 0.3567      0  60.757      0  0.2194
## 
## $Best.partition
##                      Albania          Antigua and Barbuda 
##                            2                            2 
##                    Argentina                      Armenia 
##                            2                            2 
##                        Aruba                    Australia 
##                            2                            2 
##                      Austria                   Azerbaijan 
##                            2                            2 
##                      Bahamas                      Bahrain 
##                            2                            2 
##                     Barbados                      Belarus 
##                            2                            2 
##                      Belgium                       Belize 
##                            2                            2 
##       Bosnia and Herzegovina                       Brazil 
##                            2                            1 
##                     Bulgaria                   Cabo Verde 
##                            2                            2 
##                       Canada                        Chile 
##                            2                            2 
##                     Colombia                   Costa Rica 
##                            2                            2 
##                      Croatia                         Cuba 
##                            2                            2 
##                       Cyprus               Czech Republic 
##                            2                            2 
##                      Denmark                     Dominica 
##                            2                            2 
##                      Ecuador                  El Salvador 
##                            2                            2 
##                      Estonia                         Fiji 
##                            2                            2 
##                      Finland                       France 
##                            2                            2 
##                      Georgia                      Germany 
##                            2                            2 
##                       Greece                      Grenada 
##                            2                            2 
##                    Guatemala                       Guyana 
##                            2                            2 
##                      Hungary                      Iceland 
##                            2                            2 
##                      Ireland                       Israel 
##                            2                            2 
##                        Italy                      Jamaica 
##                            2                            2 
##                        Japan                   Kazakhstan 
##                            1                            2 
##                     Kiribati                       Kuwait 
##                            2                            2 
##                   Kyrgyzstan                       Latvia 
##                            2                            2 
##                    Lithuania                   Luxembourg 
##                            2                            2 
##                        Macau                     Maldives 
##                            2                            2 
##                        Malta                    Mauritius 
##                            2                            2 
##                       Mexico                     Mongolia 
##                            2                            2 
##                   Montenegro                  Netherlands 
##                            2                            2 
##                  New Zealand                    Nicaragua 
##                            2                            2 
##                       Norway                         Oman 
##                            2                            2 
##                       Panama                     Paraguay 
##                            2                            2 
##                  Philippines                       Poland 
##                            2                            2 
##                     Portugal                  Puerto Rico 
##                            2                            2 
##                        Qatar            Republic of Korea 
##                            2                            2 
##                      Romania           Russian Federation 
##                            2                            1 
##        Saint Kitts and Nevis                  Saint Lucia 
##                            2                            2 
## Saint Vincent and Grenadines                   San Marino 
##                            2                            2 
##                       Serbia                   Seychelles 
##                            2                            2 
##                    Singapore                     Slovakia 
##                            2                            2 
##                     Slovenia                 South Africa 
##                            2                            2 
##                        Spain                    Sri Lanka 
##                            2                            2 
##                     Suriname                       Sweden 
##                            2                            2 
##                  Switzerland                     Thailand 
##                            2                            2 
##          Trinidad and Tobago                       Turkey 
##                            2                            2 
##                 Turkmenistan                      Ukraine 
##                            2                            2 
##         United Arab Emirates               United Kingdom 
##                            2                            2 
##                United States                      Uruguay 
##                            1                            2 
##                   Uzbekistan 
##                            2

So we stick with 4 clusters and hence we have a robust model. Calculating goodness

  goodness_km <- km$betweenss / km$totss
  goodness_km
## [1] 0.875054

Validating clusters using validation sets

clusters_df <- data.frame(NA, NA)
colnames(clusters_df) <- c("country", "cluster")
clusters_df <- na.omit(clusters_df)
for(i in levels(factor(km$cluster))){
  rnames <- rownames(df[km$cluster == i, ])
  for(j in rnames){
    tempdf <- data.frame(j, i)
    colnames(tempdf) <- c("country", "cluster")
    clusters_df <- rbind(clusters_df, tempdf)
  }
}
rownames(clusters_df) <- clusters_df$country
clusters_df <- clusters_df[-1]

df <- merge(clusters_df,df, by='row.names', all=TRUE)
scaled_df <- merge(clusters_df, scaled_df, by="row.names", all = TRUE)

Train and Test Data

library(caTools)
library(dplyr)
set.seed(123)
n = nrow(scaled_df)
split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.8, 0.2))
training_set <- scaled_df[split, c(1,2,67,68)]
test_set <- scaled_df[!split, c(1,2,67,68)]
str(test_set)
## 'data.frame':    18 obs. of  4 variables:
##  $ Row.names       : 'AsIs' chr  "Armenia" "Aruba" "Azerbaijan" "Barbados" ...
##  $ cluster         : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 2 3 4 3 3 3 ...
##  $ total_pop       : num  -0.384 -0.451 -0.353 -0.447 3.872 ...
##  $ total_suicide_ct: num  -0.356 -0.366 -0.358 -0.366 0.877 ...

Predicting Clusters for training data

library(clue)
pred <- cl_predict(km, training_set[,3:4])
str(pred)
##  'cl_class_ids' int [1:83] 3 3 4 4 3 3 3 3 3 3 ...
result_df <- data.frame(cbind(original_cluster = training_set$cluster, 
                   predicted_cluster = pred))
rownames(result_df) <- training_set$Row.names
head(result_df, 10)
##                     original_cluster predicted_cluster
## Albania                            3                 3
## Antigua and Barbuda                3                 3
## Argentina                          4                 4
## Australia                          4                 4
## Austria                            3                 3
## Bahamas                            3                 3
## Bahrain                            3                 3
## Belarus                            3                 3
## Belgium                            3                 3
## Belize                             3                 3
correctly_predicted <- nrow(result_df[result_df$original_cluster == result_df$predicted_cluster, ])
accuracy_rate <- correctly_predicted / nrow(result_df)
accuracy_rate
## [1] 1